{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from utils import tokenize, load_curpus\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 0.429 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read both corpus splits; later cells iterate over each split as (content, sentiment) pairs.\n",
    "train_data = load_curpus(\"weibo2018/train.txt\")\n",
    "test_data = load_curpus(\"weibo2018/test.txt\")\n",
    "\n",
    "# Tabular views of the same records, built with one shared column layout.\n",
    "frame_columns = [\"content\", \"sentiment\"]\n",
    "train_df, test_df = (\n",
    "    pd.DataFrame(split, columns=frame_columns) for split in (train_data, test_data)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 加载停用词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# One stopword per line; surrounding whitespace (including the newline) is stripped.\n",
    "with open(\"stopwords.txt\", \"r\", encoding=\"utf8\") as stopword_file:\n",
    "    stopwords = [line.strip() for line in stopword_file]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 加载之前训练好的FastText模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from gensim.models import FastText\n",
    "# Restore the previously trained FastText model from disk; it supplies the word\n",
    "# vectors used to embed every sentence below.\n",
    "# NOTE(review): the input placeholder defined later fixes the embedding width at 100,\n",
    "# so this checkpoint presumably produces 100-dimensional vectors -- confirm.\n",
    "model = FastText.load(\"model/model_100.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Number of time steps every sentence is truncated / zero-padded to; it is also the\n",
    "# time dimension of the input placeholder in the model cell.\n",
    "max_length = 128"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 为保证输入神经网络的向量长度一致, 要对长度不足max_length的句子用零向量补齐, 对长度超过max_length的句子进行截断"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn every training sample into a fixed-size (1, max_length, dim) embedding matrix.\n",
    "#\n",
    "# Lookups go through `model.wv`: using `in` / `[]` on the model object itself is\n",
    "# deprecated in gensim and scheduled for removal in 4.0.0.\n",
    "# Loop-local names deliberately avoid `X` / `y`, which the model cell binds to its\n",
    "# placeholders; re-running this cell must not clobber them.\n",
    "X_train, train_length, y_train = [], [], []\n",
    "for content, sentiment in train_data:\n",
    "    # Keep at most max_length tokens and drop the ones the model cannot embed.\n",
    "    vectors = [model.wv[w] for w in content[:max_length] if w in model.wv]\n",
    "    if not vectors:\n",
    "        continue  # nothing could be embedded: the sample is skipped entirely\n",
    "    length = len(vectors)\n",
    "    # Rows after `length` stay zero (padding); `length` is stored alongside so the\n",
    "    # network can be told the true sequence length.\n",
    "    padded = np.zeros((1, max_length, vectors[0].shape[0]), dtype=vectors[0].dtype)\n",
    "    padded[0, :length] = vectors\n",
    "    X_train.append(padded)\n",
    "    train_length.append(length)\n",
    "    y_train.append(sentiment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn every test sample into a fixed-size (1, max_length, dim) embedding matrix,\n",
    "# using exactly the same procedure as for the training split.\n",
    "#\n",
    "# Lookups go through `model.wv`: using `in` / `[]` on the model object itself is\n",
    "# deprecated in gensim and scheduled for removal in 4.0.0.\n",
    "# Loop-local names deliberately avoid `X` / `y`, which the model cell binds to its\n",
    "# placeholders; re-running this cell must not clobber them.\n",
    "X_test, test_length, y_test = [], [], []\n",
    "for content, sentiment in test_data:\n",
    "    # Keep at most max_length tokens and drop the ones the model cannot embed.\n",
    "    vectors = [model.wv[w] for w in content[:max_length] if w in model.wv]\n",
    "    if not vectors:\n",
    "        continue  # nothing could be embedded: the sample is skipped entirely\n",
    "    length = len(vectors)\n",
    "    # Rows after `length` stay zero (padding); `length` is stored alongside so the\n",
    "    # network can be told the true sequence length.\n",
    "    padded = np.zeros((1, max_length, vectors[0].shape[0]), dtype=vectors[0].dtype)\n",
    "    padded[0, :length] = vectors\n",
    "    X_test.append(padded)\n",
    "    test_length.append(length)\n",
    "    y_test.append(sentiment)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Attention+BiLSTM\n",
    "由于tensorflow的Attention是基于Seq2Seq结构的，我这里也采用了这种结构，不过并没有使用“解码器”"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/albert/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow.contrib import rnn, seq2seq\n",
    "batch_size = 512\n",
    "lr = 1e-3\n",
    "hidden_size = 100\n",
    "embedding_size = 100  # width of the word vectors fed into X\n",
    "\n",
    "# Graph inputs. Every dimension is static, so each feed must contain exactly batch_size samples.\n",
    "X = tf.placeholder(shape=(batch_size, max_length, embedding_size), dtype=tf.float32, name=\"X\")\n",
    "L = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=\"L\")  # unpadded length of each sentence\n",
    "y = tf.placeholder(shape=(batch_size, 1), dtype=tf.float32, name=\"y\")\n",
    "dropout = tf.placeholder(shape=(), dtype=tf.float32, name=\"dropout\")  # used as the output *keep* probability\n",
    "\n",
    "with tf.variable_scope(\"lstm\", reuse=tf.AUTO_REUSE):\n",
    "    def lstm_cell(hidden_size, cell_id=0):\n",
    "        \"\"\"Build an LSTM cell named 'cell<cell_id>' whose outputs pass through dropout.\"\"\"\n",
    "        cell = rnn.LSTMCell(hidden_size, reuse=tf.AUTO_REUSE, name='cell%d' % cell_id)\n",
    "        cell = rnn.DropoutWrapper(cell, output_keep_prob=dropout)\n",
    "        return cell\n",
    "\n",
    "    # Trainable query vector, tiled so that every sample in the batch receives the same one.\n",
    "    context = tf.get_variable(\"context\", shape=(1, hidden_size))\n",
    "    context = tf.tile(context, [batch_size, 1])\n",
    "\n",
    "    # BiLSTM encoder\n",
    "    fw_cell = lstm_cell(hidden_size, 0)\n",
    "    bw_cell = lstm_cell(hidden_size, 1)\n",
    "    fw_zero = fw_cell.zero_state(batch_size, tf.float32)\n",
    "    # Each direction gets its initial state from its own cell (this used to be taken from fw_cell).\n",
    "    bw_zero = bw_cell.zero_state(batch_size, tf.float32)\n",
    "\n",
    "    # Run the encoder; sequence_length tells it how many steps of each sentence are real.\n",
    "    encoder_output, encoder_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,\n",
    "                                                         cell_bw=bw_cell,\n",
    "                                                         inputs=X,\n",
    "                                                         sequence_length=L,\n",
    "                                                         initial_state_fw=fw_zero,\n",
    "                                                         initial_state_bw=bw_zero,\n",
    "                                                         dtype=tf.float32)\n",
    "\n",
    "    # Attention module: the memory is the concatenation of forward and backward outputs.\n",
    "    attention_context = tf.concat(encoder_output, axis=2)\n",
    "    attention_mech = seq2seq.BahdanauAttention(hidden_size * 2,\n",
    "                                                 memory=attention_context,\n",
    "                                                 memory_sequence_length=L,\n",
    "                                                 name=\"AttentionMechanism\")\n",
    "    attention_cell = seq2seq.AttentionWrapper(cell=lstm_cell(hidden_size, 2),\n",
    "                                                attention_mechanism=attention_mech,\n",
    "                                                attention_layer_size=hidden_size,\n",
    "                                                alignment_history=True,\n",
    "                                                output_attention=True,\n",
    "                                                name=\"AttentionCell\")\n",
    "\n",
    "    # A single step of the attention cell, queried with `context`, yields the attention-weighted vector.\n",
    "    attention_zero = attention_cell.zero_state(batch_size, tf.float32)\n",
    "    attention_output, attention_state = attention_cell.call(context, attention_zero)\n",
    "    aligments = attention_state.alignments  # same field the positional index 3 referred to\n",
    "\n",
    "    # Binary classification head applied directly to the attention output.\n",
    "    # NOTE(review): there is no activation between the two affine layers, so the head is\n",
    "    # effectively linear. Left unchanged so the recorded metrics stay valid.\n",
    "    W1 = tf.get_variable(\"W1\", shape=(hidden_size, 50))\n",
    "    b1 = tf.get_variable(\"b1\", shape=(50,))\n",
    "    W2 = tf.get_variable(\"W2\", shape=(50, 1))\n",
    "    b2 = tf.get_variable(\"b2\", shape=(1,))\n",
    "    fcn1 = tf.nn.xw_plus_b(attention_output, W1, b1)\n",
    "    logists = tf.nn.xw_plus_b(fcn1, W2, b2)\n",
    "    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logists, labels=y))     # cross entropy\n",
    "    op = tf.train.AdamOptimizer(lr).minimize(loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Create the session used for both training and inference, with the per-process\n",
    "# GPU memory fraction set to 0.6.\n",
    "gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)\n",
    "config = tf.ConfigProto(gpu_options=gpu_options)\n",
    "sess = tf.Session(config=config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step: 0  loss: 0.9181123\n",
      "step: 100  loss: 0.3422089\n",
      "step: 200  loss: 0.13662013\n",
      "step: 300  loss: 0.060159855\n",
      "step: 400  loss: 0.013493343\n",
      "step: 500  loss: 0.12087765\n",
      "step: 600  loss: 0.010414938\n",
      "step: 700  loss: 0.0016351113\n",
      "step: 800  loss: 0.00053552777\n",
      "step: 900  loss: 0.0037728162\n",
      "step: 1000  loss: 0.008088736\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "total_step = 1001\n",
    "checkpoint_prefix = 'model/attention/model'\n",
    "# Saver.save() does not create missing directories, so make sure the target exists.\n",
    "os.makedirs(os.path.dirname(checkpoint_prefix), exist_ok=True)\n",
    "\n",
    "num_train = len(X_train)\n",
    "if num_train == 0:\n",
    "    raise ValueError(\"X_train is empty: no training sample could be embedded\")\n",
    "\n",
    "sess.run(tf.global_variables_initializer())\n",
    "saver = tf.train.Saver(max_to_keep=1)\n",
    "cursor = 0\n",
    "for step in range(total_step):\n",
    "    # Take batch_size consecutive samples, wrapping around at the end of the data.\n",
    "    # Modulo indexing always produces a full batch, even when the training set is\n",
    "    # smaller than batch_size (the placeholders require exactly batch_size rows).\n",
    "    batch_idx = [(cursor + offset) % num_train for offset in range(batch_size)]\n",
    "    cursor = (cursor + batch_size) % num_train\n",
    "    _X = np.concatenate([X_train[i] for i in batch_idx])\n",
    "    _L = np.array([train_length[i] for i in batch_idx], dtype=np.int32)\n",
    "    _y = np.array([y_train[i] for i in batch_idx], dtype=np.float32).reshape(batch_size, 1)\n",
    "    # `dropout` is fed as a keep probability: 75% of the cell outputs are kept while training.\n",
    "    _, loss_value = sess.run([op, loss], feed_dict={X: _X, L: _L, y: _y, dropout: .75})\n",
    "    if step % 100 == 0:\n",
    "        print(\"step:\", step, \" loss:\", loss_value)\n",
    "        saver.save(sess, checkpoint_prefix, global_step=step)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# The graph accepts exactly batch_size rows per run, so the whole test set is padded\n",
    "# up to one full batch. Fail early, with a clear message, if it cannot fit.\n",
    "num_test = len(X_test)\n",
    "if not 0 < num_test <= batch_size:\n",
    "    raise ValueError(\n",
    "        \"expected between 1 and %d test samples, got %d\" % (batch_size, num_test))\n",
    "pad_rows = batch_size - num_test\n",
    "_X = np.concatenate(X_test + [np.zeros_like(X_test[0])] * pad_rows)\n",
    "# Padding rows are given length 1 rather than 0.\n",
    "# NOTE(review): presumably to avoid zero-length sequences in the attention mask -- confirm.\n",
    "_L = np.array(test_length + [1] * pad_rows, dtype=np.int32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Inference keeps every cell output (keep probability 1.0).\n",
    "result = sess.run(tf.nn.sigmoid(logists), feed_dict={X: _X, L: _L, dropout:1.})\n",
    "# Threshold the probabilities of the real samples at 0.5; the rows after\n",
    "# len(X_test) are batch padding and are ignored.\n",
    "prediction = [1 if prob > 0.5 else 0 for prob in result[:len(X_test)]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 效果测评\n",
    "相比纯LSTM提升没那么明显，主要是因为该任务相对简单，语料少。迁移至更复杂任务后注意力的作用会越来越明显"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.79      0.75      0.77       155\n",
      "          1       0.89      0.91      0.90       345\n",
      "\n",
      "avg / total       0.86      0.86      0.86       500\n",
      "\n",
      "准确率: 0.86\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "# Per-class precision / recall / F1, followed by the overall accuracy on the test split.\n",
    "report = metrics.classification_report(y_test, prediction)\n",
    "accuracy = metrics.accuracy_score(y_test, prediction)\n",
    "print(report)\n",
    "print(\"准确率:\", accuracy)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
